{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "from datetime import datetime\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# au_nt_gov_kerr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# name of this dataset; also used to name the output files below\n",
    "dataset_name = 'au_nt_gov_kerr'\n",
    "\n",
    "# local directory holding the images; image paths are joined onto this root\n",
    "container_root = '/mink_disk_0/camtraps/nt-gov-au/'\n",
    "\n",
    "# where the sequence items are written: the final file and a temporary one\n",
    "megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'\n",
    "path_to_output = f'{megadb_json_dir}/{dataset_name}.json'\n",
    "path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
    "The last directory level of each image path contains the species name; the location code is embedded in the file name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19353"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "19053"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# every file found under the container root\n",
    "all_files = path_utils.recursive_file_list(container_root)\n",
    "len(all_files)\n",
    "\n",
    "# keep only the image files, as a sorted list with the container root cut off\n",
    "image_files = [f for f in all_files if path_utils.is_image_file(f)]\n",
    "paths = sorted(f.split(container_root)[1] for f in image_files)\n",
    "len(paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Training Images (Custom Vision)/Brush-tailed Rabbit-rat/Brush-tailed Rabbit-ratIMG_0008.JPG'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "'brush-tailed rabbit-rat'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paths[1000]\n",
    "paths[1000].split('/')[-2].lower().split('_')[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Corrections for directory names that are misspelled, inconsistently formatted,\n",
    "# or not a species at all.\n",
    "#\n",
    "# Keys are looked up AFTER the directory name has been lower-cased and cut at its\n",
    "# first underscore, so every key must be lower-case and contain no underscore\n",
    "# (e.g. the directory 'Carlia_Middle Arm 2019' is looked up as 'carlia').\n",
    "# A value of None marks a directory that is not a species label; the species is\n",
    "# then taken from the directory one level above.\n",
    "species_name_mapping = {\n",
    "    'rainbow bee eater': 'rainbow bee-eater',\n",
    "    'australian owlet nightjar': 'australian owlet-nightjar',\n",
    "    'bar shouldered dove': 'bar-shouldered dove',\n",
    "    'black wallaro': 'black wallaroo',\n",
    "    'blue winged kookaburra': 'blue-winged kookaburra',\n",
    "    'black footed tree rat': 'black-footed tree-rat',\n",
    "    'brush tailed rabbit rat': 'brush-tailed rabbit rat',\n",
    "    'brush-tailed rabbit-rat': 'brush-tailed rabbit rat',\n",
    "    'brush tailed mulgara': 'brush-tailed mulgara',\n",
    "    'common brushtailed possum': 'common brushtail possum',\n",
    "    'fawn antechnius': 'fawn antechinus',\n",
    "    'common bronzewing pigeon': 'common bronzewing',\n",
    "    'common rock-rat': 'common rock rat',\n",
    "    'gregory np 2017': None,  # this is a national park\n",
    "    'grey shrike thrush': 'grey shrike-thrush',\n",
    "    'middle arm': None,\n",
    "    'middle arm 2019': None,\n",
    "    'magpie lark': 'magpie-lark',\n",
    "    'mtf- gunn point 2018': None,\n",
    "    'willy wagtail': 'willie wagtail',\n",
    "    'nitmiluk': None,\n",
    "    'white-bellied sea-eagle- incorrect photo titles': 'white-bellied sea-eagle',\n",
    "    'rainbow pita': 'rainbow pitta',\n",
    "    'sandy inland mounse': 'sandy inland mouse',\n",
    "    'variouslocations': None,\n",
    "    'set up': 'human',\n",
    "    'swamp- water buffalo': 'water buffalo',\n",
    "    'spinifex hopping-mouse': 'spinifex hopping mouse',\n",
    "    'white winged fairy wren': 'white-winged fairywren',\n",
    "    'none': 'empty',\n",
    "    'rhinella marina (cane toad)': 'rhinella marina',\n",
    "    'tropicagama temporalis (swamplands lashtail)': 'tropicagama temporalis',\n",
    "    'varanus scalaris (spotted tree monitor)': 'varanus scalaris',\n",
    "    'pseudonaja textilis (eastern brown snake)': 'pseudonaja textilis',\n",
    "    'straw necked ibis': 'straw-necked ibis',\n",
    "    'grey crowned babbler': 'grey-crowned babbler',\n",
    "    'lophognathus gilberti (gilberts dragon)': 'lophognathus gilberti',\n",
    "    'chlamydosaurus kingii (frilled lizard)': 'chlamydosaurus kingii',\n",
    "    \"byone's\": \"bynoe's gecko\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Examples of prefixes that are surely different locations: 'WD', 'GP', 'MA', 'BAT'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_location(p):\n",
    "    \"\"\"Guess the camera location code of an image from its path.\n",
    "\n",
    "    The code is the leading token of the file name (the text before the first\n",
    "    underscore), taken after removing the parent directory's name when the file\n",
    "    name starts with it. Dataset-specific clean-up rules are then applied, and\n",
    "    any token that is not a usable location code becomes 'unknown'.\n",
    "\n",
    "    Args:\n",
    "        p: image path using '/' as separator, for example\n",
    "            'Some Folder/Species Name/WD-B1400C1_08504_17-11-2019_23-39.JPG'\n",
    "\n",
    "    Returns:\n",
    "        A short location code such as 'WD', 'GP' or 'MA', or 'unknown'.\n",
    "    \"\"\"\n",
    "    # leading file-name tokens that are known not to be location codes\n",
    "    not_locations = {'IMG', 'SVG', 'Willie', 'White', 'Silver', 'end', '', 'Short',\n",
    "                     'Water', 'blue', 'Set', 'Black', 'SVL'}\n",
    "\n",
    "    p_parts = p.split('/')\n",
    "    basename = os.path.basename(p)\n",
    "\n",
    "    # parent directory name up to its first underscore; empty if p has no directory\n",
    "    species = p_parts[-2].split('_')[0] if len(p_parts) > 1 else ''\n",
    "\n",
    "    # an empty species must not be used as a separator: str.split('') raises ValueError\n",
    "    if species and basename.startswith(species):\n",
    "        b = basename.split(species)[1]\n",
    "        if b.startswith('.'):\n",
    "            b = b.split('.')[1]\n",
    "\n",
    "        if b.startswith('IMG_'):\n",
    "            location = 'unknown'\n",
    "        else:\n",
    "            location = b.split('_')[0]\n",
    "    elif basename.startswith('IMG_'):\n",
    "        location = 'unknown'\n",
    "    else:\n",
    "        location = basename.split('_')[0]\n",
    "\n",
    "    location = location.strip()\n",
    "\n",
    "    # keep one dash-separated piece: the first, or the one following a leading dash\n",
    "    if '-' in location and not location.startswith('-'):\n",
    "        location = location.split('-')[0]\n",
    "    if location.startswith('-'):\n",
    "        location = location.split('-')[1]\n",
    "    if location.lower().endswith('.jpg'):\n",
    "        location = 'unknown'\n",
    "    if location.startswith('sp.'):\n",
    "        location = location.split('sp.')[1]\n",
    "    if ')' in location:\n",
    "        location = location.split(')')[1]\n",
    "\n",
    "    # drop these words when they are glued to the front of the token\n",
    "    # (a former 'Set-up' rule was removed: no dash can remain at this point)\n",
    "    for label in ('None', 'dog', 'Egret'):\n",
    "        if location.startswith(label):\n",
    "            location = location.split(label)[1]\n",
    "\n",
    "    if location.startswith('Lo'):\n",
    "        location = 'unknown'\n",
    "    if len(location) > 7:\n",
    "        location = 'unknown'\n",
    "    if 'C' in location:\n",
    "        location = location.split('C')[0]\n",
    "\n",
    "    # Reject non-location tokens BEFORE collapsing prefixes; otherwise tokens such as\n",
    "    # 'SVL' or 'Silver' would be turned into location 'S' and never reach this check.\n",
    "    if location in not_locations:\n",
    "        return 'unknown'\n",
    "\n",
    "    # be conservative: codes sharing one of these prefixes count as a single location\n",
    "    for prefix in ('GP', 'GE', 'S', 'Grid'):\n",
    "        if location.startswith(prefix):\n",
    "            return prefix\n",
    "\n",
    "    return location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'MA'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_location('Training Images (Gold Standard)/Carlia_Middle Arm 2019/MA-MA25_C1_01352_4-08-2019_12-52.JPG')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 19053/19053 [00:00<00:00, 66790.81it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "19053"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences = []\n",
    "locations = []\n",
    "set_species = set()\n",
    "\n",
    "for p in tqdm(paths):\n",
    "    p_parts = p.split('/')\n",
    "\n",
    "    location = get_location(p)\n",
    "    locations.append(location)\n",
    "\n",
    "    # species label: parent directory name up to its first underscore, lower-cased,\n",
    "    # then corrected through species_name_mapping\n",
    "    species = p_parts[-2].split('_')[0].lower()\n",
    "    species = species_name_mapping.get(species, species)\n",
    "    if species is None:\n",
    "        # this directory is a location, species is one above\n",
    "        species = p_parts[-3].lower().split('_')[0]\n",
    "        species = species_name_mapping.get(species, species)\n",
    "        if species is None:\n",
    "            # still unresolved: print the path so it can be inspected\n",
    "            print(p)\n",
    "\n",
    "    set_species.add(species)\n",
    "\n",
    "    # each image becomes its own single-frame sequence with a placeholder ID;\n",
    "    # no datetime is recorded\n",
    "    sequences.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_{location}_{len(sequences)}',\n",
    "        'location': location,\n",
    "        'class': [species],\n",
    "        'images': [{\n",
    "            'file': p,\n",
    "            'frame_num': 1\n",
    "        }]\n",
    "    })\n",
    "\n",
    "len(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "Counter({'GP': 1574,\n",
       "         'unknown': 6407,\n",
       "         'AR': 242,\n",
       "         'BAT': 1301,\n",
       "         'KOO': 357,\n",
       "         'WD': 2745,\n",
       "         'MEL': 208,\n",
       "         'WAD': 414,\n",
       "         'BP': 291,\n",
       "         'BA': 26,\n",
       "         'S': 1080,\n",
       "         'WN': 5,\n",
       "         'BN': 8,\n",
       "         'Grid': 250,\n",
       "         'KNP': 2136,\n",
       "         'MA': 1593,\n",
       "         'GE': 386,\n",
       "         'FGc': 3,\n",
       "         'NMc': 3,\n",
       "         'FOH': 15,\n",
       "         'MAN': 9})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "location_counter = Counter(locations)\n",
    "len(location_counter)\n",
    "location_counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "198"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'accipiter sp',\n",
       " 'agamidae',\n",
       " 'agile wallaby',\n",
       " 'anilios sp',\n",
       " 'antilopene wallaroo',\n",
       " 'australian magpie',\n",
       " 'australian owlet-nightjar',\n",
       " 'australian white ibis',\n",
       " 'banteng',\n",
       " 'bar-shouldered dove',\n",
       " 'black butcherbird',\n",
       " 'black kite',\n",
       " 'black rat',\n",
       " 'black wallaroo',\n",
       " 'black-faced cuckoo-shrike',\n",
       " 'black-faced woodswallow',\n",
       " 'black-footed tree-rat',\n",
       " 'black-tailed monitor',\n",
       " 'black-tailed treecreeper',\n",
       " 'blue-faced honeyeater',\n",
       " 'blue-tongue lizard',\n",
       " 'blue-winged kookaburra',\n",
       " 'brown goshawk',\n",
       " 'brown honeyeater',\n",
       " 'brown quail',\n",
       " 'brush-tailed mulgara',\n",
       " 'brush-tailed rabbit rat',\n",
       " 'buffalo',\n",
       " 'bush stone-curlew',\n",
       " \"bynoe's gecko\",\n",
       " 'cane toad',\n",
       " 'carlia',\n",
       " 'carlia sp',\n",
       " 'cat',\n",
       " 'cattle',\n",
       " 'chestnut-backed button-quail',\n",
       " 'chlamydosaurus kingii',\n",
       " 'collared sparrowhawk',\n",
       " 'common blue-tongued lizard',\n",
       " 'common bronzewing',\n",
       " 'common brushtail possum',\n",
       " 'common planigale',\n",
       " 'common rock rat',\n",
       " 'common wallaroo',\n",
       " 'conilurus, leggedina, mesembriomys, notomys, pseudoms, zyzomys',\n",
       " 'corvus',\n",
       " 'crested bellbird',\n",
       " 'crimson finch',\n",
       " 'cryptoblepharus sp',\n",
       " 'ctenotus',\n",
       " 'delicate mouse',\n",
       " 'demansia',\n",
       " 'demansia sp',\n",
       " 'dendrelaphis punctulatus',\n",
       " 'diamond dove',\n",
       " 'dingo',\n",
       " 'diporiphora',\n",
       " 'diporiphora bilineata',\n",
       " 'diporiphora sp',\n",
       " 'dog',\n",
       " 'donkey',\n",
       " 'double-barred finch',\n",
       " 'dusky rat',\n",
       " 'eastern brown snake',\n",
       " 'echidna',\n",
       " 'emerald dove',\n",
       " 'empty',\n",
       " 'emu',\n",
       " 'fawn antechinus',\n",
       " 'forest kingfisher',\n",
       " 'frilled lizard',\n",
       " 'frilled lizard select',\n",
       " 'galah',\n",
       " 'gecko sp',\n",
       " \"gilbert's dragon\",\n",
       " 'gouldian finch',\n",
       " 'grassland melomys',\n",
       " 'great bowerbird',\n",
       " 'green tree snake',\n",
       " 'grey butcherbird',\n",
       " 'grey goshawk',\n",
       " 'grey shrike-thrush',\n",
       " 'grey-crowned babbler',\n",
       " 'groote southern leases',\n",
       " 'hooded robin',\n",
       " \"horesfield's bushlark\",\n",
       " 'horse',\n",
       " 'human',\n",
       " 'kakadu',\n",
       " 'kakadu dunnart',\n",
       " 'king brown snake',\n",
       " 'large-tailed nightjar',\n",
       " 'leaden flycatcher',\n",
       " 'leopard ctenotus',\n",
       " 'lesser black whipsnake',\n",
       " 'liopholis striata',\n",
       " 'litoria caerulea',\n",
       " 'little button-quail',\n",
       " 'little corella',\n",
       " 'long-tailed finch',\n",
       " 'lophognathus gilberti',\n",
       " 'lophopnathus',\n",
       " 'magpie-lark',\n",
       " 'masked finch',\n",
       " \"merten's water montitor\",\n",
       " \"mitchell's water monitor\",\n",
       " 'morethia',\n",
       " 'muridae',\n",
       " 'northern brown bandicoot',\n",
       " 'northern fantail',\n",
       " 'northern hopping mouse',\n",
       " 'northern nailtail wallaby',\n",
       " 'northern quoll',\n",
       " 'northern rosella',\n",
       " 'orange-footed scrubfowl',\n",
       " 'oriental cuckoo',\n",
       " 'pale field rat',\n",
       " 'pale field-rat',\n",
       " 'papuan whip snake',\n",
       " 'partridge pidgeon',\n",
       " 'partridge pigeon',\n",
       " 'peaceful dove',\n",
       " 'pheasant coucal',\n",
       " 'pictorella mannikin',\n",
       " 'pied butcherbird',\n",
       " 'pig',\n",
       " 'planigale',\n",
       " 'pseudechis weigeli',\n",
       " 'pseudonaja textilis',\n",
       " 'psuedonaja nuchalis',\n",
       " 'rainbow bee-eater',\n",
       " 'rainbow pitta',\n",
       " 'rattus mus',\n",
       " 'red cheeked dunnart',\n",
       " 'red tailed black cockatoo',\n",
       " 'red-backed button-quail',\n",
       " 'red-backed fairy-wren',\n",
       " 'red-cheeked dunnart',\n",
       " 'red-chested button-quail',\n",
       " 'red-tailed black-cockatoo',\n",
       " 'red-winged parrot',\n",
       " 'rhinella marina',\n",
       " 'ridge-tailed monitor',\n",
       " 'rufous songlark',\n",
       " 'rufous whislter',\n",
       " 'sand goanna',\n",
       " 'sandstone antechinus',\n",
       " 'sandy inland mouse',\n",
       " 'scincidae',\n",
       " 'shining flycatcher',\n",
       " 'short-eared rock wallaby',\n",
       " 'silver-backed butcher bird',\n",
       " 'singing honeyeater',\n",
       " 'small mammal',\n",
       " 'sminthopsis',\n",
       " 'sminthopsis sp',\n",
       " 'southern boobook',\n",
       " 'spangled drongo',\n",
       " 'spinifex hopping mouse',\n",
       " 'spinifex pigeon',\n",
       " 'spotted nightjar',\n",
       " 'spotted tree monitor',\n",
       " 'straw-necked ibis',\n",
       " 'stripe-faced dunnart',\n",
       " 'sugar glider',\n",
       " 'sulphur-crested cockatoo',\n",
       " 'swamplands lashtail',\n",
       " 'tawny frogmouth',\n",
       " 'tiliqua multifasciata',\n",
       " 'tiliqua scincoides',\n",
       " 'torresian crow',\n",
       " 'tropicagama',\n",
       " 'tropicagama temporalis',\n",
       " 'varanus acanthurus',\n",
       " 'varanus gilleni',\n",
       " 'varanus glebopalma',\n",
       " 'varanus gouldii',\n",
       " 'varanus mertensi',\n",
       " 'varanus mitchelli',\n",
       " 'varanus panoptes',\n",
       " 'varanus scalaris',\n",
       " 'varanus sp',\n",
       " 'varanus tristis',\n",
       " 'variegated fairy-wren',\n",
       " 'various frogs',\n",
       " 'various small skinks',\n",
       " 'water buffalo',\n",
       " 'water python',\n",
       " 'water rat',\n",
       " 'west davs',\n",
       " 'western chestnut mouse',\n",
       " 'white-bellied sea-eagle',\n",
       " 'white-quilled rock-pigeon',\n",
       " 'white-winged fairywren',\n",
       " 'willie wagtail',\n",
       " 'yellow-spotted monitor',\n",
       " 'yellow-throated miner',\n",
       " 'yellow-tinted honeyeater'}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set_species)\n",
    "set_species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dataset': 'au_nt_gov_kerr',\n",
       " 'seq_id': 'dummy_WD_18853',\n",
       " 'location': 'WD',\n",
       " 'class': ['sandy inland mouse'],\n",
       " 'images': [{'file': 'TrainingData_3/small mammals/west_davs_1of2/sandy inland mouse/WD-B1400C1_08504_17-11-2019_23-39.JPG',\n",
       "   'frame_num': 1}]}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences[-200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dataset': 'au_nt_gov_kerr',\n",
       " 'seq_id': 'dummy_unknown_1000',\n",
       " 'location': 'unknown',\n",
       " 'class': ['brush-tailed rabbit rat'],\n",
       " 'images': [{'file': 'Training Images (Custom Vision)/Brush-tailed Rabbit-rat/Brush-tailed Rabbit-ratIMG_0008.JPG',\n",
       "   'frame_num': 1}]}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sequences[1000]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second step will take some time (~ a minute). \n",
    "\n",
    "Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 2.28 s, sys: 4.49 ms, total: 2.29 s\n",
      "Wall time: 2.29 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - Copy images to a flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_file(src_path, dst_path):\n",
    "    \"\"\"Copy the file at src_path to dst_path.\n",
    "\n",
    "    Args:\n",
    "        src_path: path of the existing file\n",
    "        dst_path: path of the copy to create\n",
    "\n",
    "    Returns:\n",
    "        Whatever shutil.copyfile returns (the destination path).\n",
    "    \"\"\"\n",
    "    copied_to = copyfile(src_path, dst_path)\n",
    "    return copied_to"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 19053/19053 [00:00<00:00, 117230.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 115 ms, sys: 52.6 ms, total: 168 ms\n",
      "Wall time: 165 ms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# flat folder that receives one copy of every non-empty image\n",
    "dst_dir = '/mink_disk_0/camtraps/imerit12c'\n",
    "\n",
    "path_pairs = []\n",
    "for seq in tqdm(sequences):\n",
    "\n",
    "    # sequences labelled 'empty' are not copied\n",
    "    if 'empty' in seq['class']:\n",
    "        continue\n",
    "\n",
    "    seq_id = seq['seq_id']\n",
    "    for im in seq['images']:\n",
    "        src_path = os.path.join(container_root, im['file'])\n",
    "        assert os.path.exists(src_path), src_path\n",
    "        # use the image's own frame number so that several frames of one\n",
    "        # sequence never map to the same destination file\n",
    "        frame = im['frame_num']\n",
    "        dst_path = os.path.join(dst_dir,\n",
    "                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "        path_pairs.append((src_path, dst_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18842"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/nt-gov-au/Training Images (Custom Vision)/Brush-tailed Rabbit-rat/Brush-tailed Rabbit-ratIMG_0008.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12c/au_nt_gov_kerr.seqdummy_unknown_1000.frame1.jpg')"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)\n",
    "path_pairs[1000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 25.7 s, sys: 1min, total: 1min 26s\n",
      "Wall time: 34 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "with ThreadPool(8) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18842"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
